Homer Closest TSS per Gene Data and Heatmaps¶
2-6-26 Update
In [1]:
import pandas as pd
import numpy as np
In [2]:
cd GEO/RNA-seq/pca_ma_plots/final/
/vf/users/gallegosda/GEO/RNA-seq/pca_ma_plots/final
In [3]:
homerClosestTSSperGene_df = pd.read_csv("homer.closestTSS_perGene.csv", names=['geneName','var2','var3','var4','var5','var6','var7','var8','var9','var10','var11','var12','var13','var14','var15','var16','var17','var18','var19'])
In [4]:
homerClosestTSSperGene_df.head(3)
Out[4]:
| geneName | var2 | var3 | var4 | var5 | var6 | var7 | var8 | var9 | var10 | var11 | var12 | var13 | var14 | var15 | var16 | var17 | var18 | var19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Car4 | AW456718|Ca4 | carbonic anhydrase 4 | 443 | chr11 | 84957788 | 84958215 | + | 0 | NaN | intron (NM_007607, intron 1 of 7) | CpG | 247 | NM_007607 | 12351 | Mm.1641 | NM_007607 | ENSMUSG00000000805 | protein-coding |
| 1 | Ppp3ca | 2900074D19Rik|CN|Caln|Calna|CnA | protein phosphatase 3, catalytic subunit, alph... | 1885 | chr3 | 136670472 | 136671032 | + | 0 | NaN | 5' UTR (NM_008913, exon 1 of 14) | 5' UTR (NM_008913, exon 1 of 14) | 686 | NM_008913 | 19055 | Mm.331389 | NM_008913 | ENSMUSG00000028161 | protein-coding |
| 2 | Nr5a1 | Ad4BP|ELP|ELP-3|Ftz-F1|Ftzf1|SF-1|SF1|STF-1 | nuclear receptor subfamily 5, group A, member 1 | 1550 | chr2 | 38711554 | 38711989 | + | 0 | NaN | intron (NM_139051, intron 1 of 6) | CpG-8071 | 463 | NM_001316687 | 26423 | Mm.31387 | NM_139051 | ENSMUSG00000026751 | protein-coding |
In [5]:
allMouseGenes_df = pd.read_csv("all_mouse_gene_ENSEMBL_IDs_and_gene_names.txt",names=['ensemblID','stableID','stableTranscript','transcriptV','geneName'],skiprows=1)
In [6]:
allMouseGenes_df
Out[6]:
| ensemblID | stableID | stableTranscript | transcriptV | geneName | |
|---|---|---|---|---|---|
| 0 | ENSMUSG00000064336 | ENSMUSG00000064336.1 | ENSMUST00000082387 | ENSMUST00000082387.1 | mt-Tf |
| 1 | ENSMUSG00000064337 | ENSMUSG00000064337.1 | ENSMUST00000082388 | ENSMUST00000082388.1 | mt-Rnr1 |
| 2 | ENSMUSG00000064338 | ENSMUSG00000064338.1 | ENSMUST00000082389 | ENSMUST00000082389.1 | mt-Tv |
| 3 | ENSMUSG00000064339 | ENSMUSG00000064339.1 | ENSMUST00000082390 | ENSMUST00000082390.1 | mt-Rnr2 |
| 4 | ENSMUSG00000064340 | ENSMUSG00000064340.1 | ENSMUST00000082391 | ENSMUST00000082391.1 | mt-Tl1 |
| ... | ... | ... | ... | ... | ... |
| 278391 | ENSMUSG00000026833 | ENSMUSG00000026833.19 | ENSMUST00000152415 | ENSMUST00000152415.2 | Olfm1 |
| 278392 | ENSMUSG00000026833 | ENSMUSG00000026833.19 | ENSMUST00000113920 | ENSMUST00000113920.8 | Olfm1 |
| 278393 | ENSMUSG00000026833 | ENSMUSG00000026833.19 | ENSMUST00000100244 | ENSMUST00000100244.10 | Olfm1 |
| 278394 | ENSMUSG00000026833 | ENSMUSG00000026833.19 | ENSMUST00000102879 | ENSMUST00000102879.4 | Olfm1 |
| 278395 | ENSMUSG00000026833 | ENSMUSG00000026833.19 | ENSMUST00000028177 | ENSMUST00000028177.11 | Olfm1 |
278396 rows × 5 columns
In [7]:
# This function returns a df from a file like TEtranscripts_GRCm38_E10_777tm1d_2KO_males_vs_1WT_female_1WT_male_non_stranded_gene_TE_analysis.txt
# Receives .txt as input and returns df
def getGeneNameForTEtranscripts_txt(TEtranscripts_txtFile):
df = pd.read_csv(TEtranscripts_txtFile, sep='\t',index_col=False,names=['ensemblID','baseMean','log2FoldChange','lfcSE','stat','pvalue','padj'],skiprows=1)
df["ensemblID"] = (
df["ensemblID"]
.astype("string") # pandas StringDtype (not plain str)
.str.replace(r"\..*", "", regex=True)
)
df = df.merge(
allMouseGenes_df[["geneName", "ensemblID"]],
on="ensemblID",
how="left"
)
df = df.drop_duplicates()
return df
In [8]:
# df = pd.read_csv("TEtranscripts_GRCm38_E10_777tm1d_2KO_males_vs_1WT_female_1WT_male_non_stranded_gene_TE_analysis.txt", sep='\t',index_col=False,names=['ensemblID','baseMean','log2FoldChange','lfcSE','stat','pvalue','padj'],skiprows=1)
# # print(df.head)
# # Rename the first column (at index 0)
# # df = df.rename(columns={df.columns[0]: 'ensemblID'})
# df["ensemblID"] = (
# df["ensemblID"]
# .astype("string") # pandas StringDtype (not plain str)
# .str.replace(r"\..*", "", regex=True)
# )
# # print(df.head)
# # df
# df = df.merge(
# allMouseGenes_df[["geneName", "ensemblID"]],
# on="ensemblID",
# how="left"
# )
# df = df.drop_duplicates()
# df
In [9]:
mMEF_E15_aKOvWT_males_df = getGeneNameForTEtranscripts_txt("TEtranscripts_GRCm38_mMEF_E15_777KO_tm1a_vs_WT_males_non_stranded_gene_TE_analysis.txt")
In [10]:
mMEF_E15_aKOvWT_females_df = getGeneNameForTEtranscripts_txt("TEtranscripts_GRCm38_mMEF_E15_777KO_tm1a_vs_WT_females_non_stranded_gene_TE_analysis.txt")
In [11]:
mMEF_E15_patientKIvE16WT_females_df = getGeneNameForTEtranscripts_txt("TEtranscripts_GRCm38_mMEF_E15_777-R297W-KI_vs_E16_WT_females_non_stranded_gene_TE_analysis.txt")
In [12]:
mMEF_E15_patientKIvE15WT_males_df = getGeneNameForTEtranscripts_txt("TEtranscripts_GRCm38_mMEF_E15_777-R297W-KI_vs_E15_WT_males_non_stranded_gene_TE_analysis.txt")
In [13]:
mMEF_E13_clusterKOvWT_males_df = getGeneNameForTEtranscripts_txt("TEtranscripts_GRCm38_mMEF_E13_DUFKZFP_cluster_KO_vs_WT_males_non_stranded_gene_TE_analysis.txt")
In [14]:
mF9_OE_patientpCMV6vpSBmock_df = getGeneNameForTEtranscripts_txt("TEtranscripts_GRCm38_mF9_OE_pCMV6_777-R297W-HA_vs_pSB_mock_non_stranded_gene_TE_analysis.txt")
In [15]:
mF9_OE_777vpSBmock_df = getGeneNameForTEtranscripts_txt("TEtranscripts_GRCm38_mF9_OE_pCMV6_777-HA_vs_pSB_mock_non_stranded_gene_TE_analysis.txt")
In [16]:
mESC_KO_EGFPexcisedvWT_df = getGeneNameForTEtranscripts_txt("TEtranscripts_GRCm38_mESC_R1_777KO_EGFP_excised_vs_WT_non_stranded_gene_TE_analysis.txt")
In [17]:
E8_aKOvWT_males_df = getGeneNameForTEtranscripts_txt("TEtranscripts_GRCm38_E8_777tm1a_KO_vs_WT_males_non_stranded_gene_TE_analysis.txt")
In [18]:
E8_aKOvWT_females_df = getGeneNameForTEtranscripts_txt("TEtranscripts_GRCm38_E8_777tm1a_KO_vs_WT_females_non_stranded_gene_TE_analysis.txt")
In [19]:
E10_dKOvWT_mf_df = getGeneNameForTEtranscripts_txt("TEtranscripts_GRCm38_E10_777tm1d_2KO_males_vs_1WT_female_1WT_male_non_stranded_gene_TE_analysis.txt")
In [ ]:
In [20]:
mMEF_E15_aKOvWT_males_df.loc[mMEF_E15_aKOvWT_males_df['geneName']=='Acadl']
Out[20]:
| ensemblID | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | geneName | |
|---|---|---|---|---|---|---|---|---|
| 29030 | ENSMUSG00000026003 | 1309.852514 | 0.029923 | 0.222277 | 0.13462 | 0.892913 | 0.981316 | Acadl |
In [21]:
mF9_OE_patientpCMV6vpSBmock_df.loc[mF9_OE_patientpCMV6vpSBmock_df['geneName']=='Acadl']
Out[21]:
| ensemblID | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | geneName | |
|---|---|---|---|---|---|---|---|---|
| 28735 | ENSMUSG00000026003 | 3563.687259 | -0.059661 | 0.074049 | -0.805698 | 0.420417 | 0.999695 | Acadl |
In [22]:
mMEF_E15_patientKIvE16WT_females_df.loc[mMEF_E15_patientKIvE16WT_females_df['geneName']=='Morc4']
Out[22]:
| ensemblID | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | geneName | |
|---|---|---|---|---|---|---|---|---|
| 46493 | ENSMUSG00000031434 | 840.227535 | -1.272542 | 0.2929 | -4.344633 | 0.000014 | 0.000212 | Morc4 |
In [23]:
mMEF_E13_clusterKOvWT_males_df.loc[mMEF_E13_clusterKOvWT_males_df['geneName']=='Lrrc7']
Out[23]:
| ensemblID | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | geneName | |
|---|---|---|---|---|---|---|---|---|
| 36227 | ENSMUSG00000028176 | 7.217716 | 0.354474 | 2.071381 | 0.171129 | 0.864122 | 0.999969 | Lrrc7 |
| 120807 | ENSMUSG00000104597 | 12.254025 | -0.081791 | 1.531621 | -0.053402 | 0.957412 | 0.999969 | Lrrc7 |
In [ ]:
In [24]:
mMEF_E15_aKOvWT_males_df
Out[24]:
| ensemblID | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | geneName | |
|---|---|---|---|---|---|---|---|---|
| 0 | ENSMUSG00000000001 | 3123.590474 | -0.012997 | 0.225714 | -0.057583 | 9.540809e-01 | 9.836959e-01 | Gnai3 |
| 1 | ENSMUSG00000000028 | 350.340286 | 0.305006 | 0.279101 | 1.092816 | 2.744747e-01 | 7.623687e-01 | Cdc45 |
| 5 | ENSMUSG00000000031 | 21314.408858 | -2.401234 | 0.274553 | -8.745967 | 2.211141e-18 | 2.637364e-15 | H19 |
| 26 | ENSMUSG00000000037 | 19.745926 | -1.137889 | 0.799598 | -1.423075 | 1.547143e-01 | 6.026881e-01 | Scml2 |
| 35 | ENSMUSG00000000049 | 39.011157 | -0.248033 | 0.586197 | -0.423122 | 6.722060e-01 | 9.259562e-01 | Apoh |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 128928 | ZP3AR:Satellite:Satellite | 626.975354 | -0.610377 | 0.323508 | -1.886742 | 5.919505e-02 | 3.684686e-01 | NaN |
| 128929 | Zaphod2:hAT-Tip100:DNA | 0.912345 | 3.191318 | 3.832658 | 0.832664 | 4.050341e-01 | 8.648981e-01 | NaN |
| 128930 | Zaphod3:hAT-Tip100:DNA | 16.753550 | -0.781496 | 0.890346 | -0.877744 | 3.800825e-01 | 8.518641e-01 | NaN |
| 128931 | Zaphod:hAT-Tip100:DNA | 13.651162 | -0.571859 | 0.995754 | -0.574297 | 5.657667e-01 | 9.105791e-01 | NaN |
| 128932 | hAT-N1_Mam:hAT:DNA | 10.122079 | -0.684578 | 1.095539 | -0.624878 | 5.320514e-01 | 9.105791e-01 | NaN |
25048 rows × 8 columns
In [25]:
homerClosestTSSperGene_GeneNamesOnly_df = homerClosestTSSperGene_df[['geneName']]
In [26]:
# Test join
df = homerClosestTSSperGene_GeneNamesOnly_df.merge(
mMEF_E15_aKOvWT_males_df[["geneName", "log2FoldChange"]],
on="geneName",
how="left"
)
In [27]:
# Rename a specific column 'B' to 'New_B'
df = df.rename(columns={'log2FoldChange': 'mMEF_E15_aKOvWT_males_log2FC'})
In [28]:
df
Out[28]:
| geneName | mMEF_E15_aKOvWT_males_log2FC | |
|---|---|---|
| 0 | Car4 | 1.339315 |
| 1 | Ppp3ca | 0.406624 |
| 2 | Nr5a1 | -4.865477 |
| 3 | Bbs5 | 0.998045 |
| 4 | Ppp3cc | 0.358830 |
| ... | ... | ... |
| 2798 | Rell1 | 0.260231 |
| 2799 | Car2 | -1.666589 |
| 2800 | Efna2 | -0.752565 |
| 2801 | Rell2 | 0.210392 |
| 2802 | Efna3 | 0.516740 |
2803 rows × 2 columns
In [29]:
# More joins
df = df.merge(
mMEF_E15_aKOvWT_females_df[["geneName", "log2FoldChange"]],
on="geneName",
how="left"
)
df = df.rename(columns={'log2FoldChange': 'mMEF_E15_aKOvWT_females_log2FC'})
df = df.merge(
mMEF_E15_patientKIvE16WT_females_df[["geneName", "log2FoldChange"]],
on="geneName",
how="left"
)
df = df.rename(columns={'log2FoldChange': 'mMEF_E15_patientKIvE16WT_females_log2FC'})
df = df.merge(
mMEF_E15_patientKIvE15WT_males_df[["geneName", "log2FoldChange"]],
on="geneName",
how="left"
)
df = df.rename(columns={'log2FoldChange': 'mMEF_E15_patientKIvE15WT_males_log2FC'})
df = df.merge(
mMEF_E13_clusterKOvWT_males_df[["geneName", "log2FoldChange"]],
on="geneName",
how="left"
)
df = df.rename(columns={'log2FoldChange': 'mMEF_E13_clusterKOvWT_males_log2FC'})
df = df.merge(
mF9_OE_patientpCMV6vpSBmock_df[["geneName", "log2FoldChange"]],
on="geneName",
how="left"
)
df = df.rename(columns={'log2FoldChange': 'mF9_OE_patientpCMV6vpSBmock_log2FC'})
df = df.merge(
mF9_OE_777vpSBmock_df[["geneName", "log2FoldChange"]],
on="geneName",
how="left"
)
df = df.rename(columns={'log2FoldChange': 'mF9_OE_777vpSBmock_log2FC'})
df = df.merge(
mESC_KO_EGFPexcisedvWT_df[["geneName", "log2FoldChange"]],
on="geneName",
how="left"
)
df = df.rename(columns={'log2FoldChange': 'mESC_KO_EGFPexcisedvWT_log2FC'})
df = df.merge(
E8_aKOvWT_males_df[["geneName", "log2FoldChange"]],
on="geneName",
how="left"
)
df = df.rename(columns={'log2FoldChange': 'E8_aKOvWT_males_log2FC'})
df = df.merge(
E8_aKOvWT_females_df[["geneName", "log2FoldChange"]],
on="geneName",
how="left"
)
df = df.rename(columns={'log2FoldChange': 'E8_aKOvWT_females_log2FC'})
#
df = df.merge(
E10_dKOvWT_mf_df[["geneName", "log2FoldChange"]],
on="geneName",
how="left"
)
df = df.rename(columns={'log2FoldChange': 'E10_dKOvWT_mf_log2FC'})
In [30]:
df
Out[30]:
| geneName | mMEF_E15_aKOvWT_males_log2FC | mMEF_E15_aKOvWT_females_log2FC | mMEF_E15_patientKIvE16WT_females_log2FC | mMEF_E15_patientKIvE15WT_males_log2FC | mMEF_E13_clusterKOvWT_males_log2FC | mF9_OE_patientpCMV6vpSBmock_log2FC | mF9_OE_777vpSBmock_log2FC | mESC_KO_EGFPexcisedvWT_log2FC | E8_aKOvWT_males_log2FC | E8_aKOvWT_females_log2FC | E10_dKOvWT_mf_log2FC | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Car4 | 1.339315 | -1.011783 | -3.589685 | NaN | -3.683206 | -1.305721 | -0.273210 | -0.489246 | 0.524201 | 0.194931 | 0.375135 |
| 1 | Ppp3ca | 0.406624 | -0.104624 | 0.001710 | 0.450459 | -0.276674 | -0.118119 | -0.211948 | 0.231611 | -0.149899 | -0.124822 | 0.226837 |
| 2 | Nr5a1 | -4.865477 | 1.510755 | -3.627988 | -4.042709 | 1.480193 | 0.222983 | 0.401732 | 0.104631 | 0.309612 | 1.176672 | -0.184091 |
| 3 | Bbs5 | 0.998045 | 0.012541 | 0.765101 | 0.823408 | 0.262260 | -0.027248 | -0.039019 | -0.415166 | 0.031988 | 0.147180 | -0.056199 |
| 4 | Ppp3cc | 0.358830 | 0.194157 | 0.157946 | 0.003984 | 0.298265 | -0.048729 | -0.261857 | 0.509793 | 0.209369 | 0.113688 | 0.548918 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11299 | Rell1 | 0.260231 | 0.282502 | 0.037566 | -0.301917 | 0.371903 | -0.111070 | -0.219812 | -0.082160 | -0.033426 | -0.008482 | 0.019051 |
| 11300 | Car2 | -1.666589 | 0.676888 | 0.011434 | 0.868281 | -0.657800 | 0.102580 | 0.150735 | -0.158000 | 0.132282 | -0.062851 | 0.284113 |
| 11301 | Efna2 | -0.752565 | 0.018711 | -1.173163 | -1.734658 | 0.266176 | -0.028690 | 0.064358 | 0.048713 | -0.273028 | -0.010901 | -0.299740 |
| 11302 | Rell2 | 0.210392 | 0.180841 | 0.070738 | -0.472328 | 0.469701 | 0.032110 | -0.152993 | -0.254215 | 0.218178 | 0.336738 | 0.453236 |
| 11303 | Efna3 | 0.516740 | 0.057861 | 1.523769 | 0.957994 | -0.837704 | 0.041005 | 0.292736 | -0.330265 | -0.341390 | -0.029855 | 0.069505 |
11304 rows × 12 columns
In [31]:
df = df.drop_duplicates()
In [39]:
df.loc[df['geneName']=='Lrrc7']
Out[39]:
| geneName | mMEF_E15_aKOvWT_males_log2FC | mMEF_E15_aKOvWT_females_log2FC | mMEF_E15_patientKIvE16WT_females_log2FC | mMEF_E15_patientKIvE15WT_males_log2FC | mMEF_E13_clusterKOvWT_males_log2FC | mF9_OE_patientpCMV6vpSBmock_log2FC | mF9_OE_777vpSBmock_log2FC | mESC_KO_EGFPexcisedvWT_log2FC | E8_aKOvWT_males_log2FC | E8_aKOvWT_females_log2FC | E10_dKOvWT_mf_log2FC | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 409 | Lrrc7 | -1.919093 | -0.668156 | -0.992237 | -1.742988 | 0.354474 | 0.663596 | 0.29079 | 0.226616 | 0.015112 | -0.003619 | 1.194863 |
| 410 | Lrrc7 | -1.919093 | -0.668156 | -0.992237 | -1.742988 | 0.354474 | 0.663596 | 0.29079 | 0.226616 | 0.015112 | -0.003619 | -0.354013 |
| 411 | Lrrc7 | -1.919093 | -0.668156 | -0.992237 | -1.742988 | 0.354474 | 0.663596 | 0.29079 | 0.226616 | 0.015112 | -0.126133 | 1.194863 |
| 412 | Lrrc7 | -1.919093 | -0.668156 | -0.992237 | -1.742988 | 0.354474 | 0.663596 | 0.29079 | 0.226616 | 0.015112 | -0.126133 | -0.354013 |
| 413 | Lrrc7 | -1.919093 | -0.668156 | -0.992237 | -1.742988 | 0.354474 | 0.663596 | 0.29079 | 0.226616 | -0.112248 | -0.003619 | 1.194863 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 660 | Lrrc7 | 0.027423 | -0.210362 | 0.229261 | -0.060135 | -0.081791 | 0.663596 | 0.29079 | 0.226616 | 0.015112 | -0.126133 | -0.354013 |
| 661 | Lrrc7 | 0.027423 | -0.210362 | 0.229261 | -0.060135 | -0.081791 | 0.663596 | 0.29079 | 0.226616 | -0.112248 | -0.003619 | 1.194863 |
| 662 | Lrrc7 | 0.027423 | -0.210362 | 0.229261 | -0.060135 | -0.081791 | 0.663596 | 0.29079 | 0.226616 | -0.112248 | -0.003619 | -0.354013 |
| 663 | Lrrc7 | 0.027423 | -0.210362 | 0.229261 | -0.060135 | -0.081791 | 0.663596 | 0.29079 | 0.226616 | -0.112248 | -0.126133 | 1.194863 |
| 664 | Lrrc7 | 0.027423 | -0.210362 | 0.229261 | -0.060135 | -0.081791 | 0.663596 | 0.29079 | 0.226616 | -0.112248 | -0.126133 | -0.354013 |
256 rows × 12 columns
In [33]:
# df.to_csv('homerClosestTSSperGene_allTEtranscripts_log2FC.csv',index=False)
Heatmaps¶
In [34]:
homerClosestTSSperGene_df
Out[34]:
| geneName | var2 | var3 | var4 | var5 | var6 | var7 | var8 | var9 | var10 | var11 | var12 | var13 | var14 | var15 | var16 | var17 | var18 | var19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Car4 | AW456718|Ca4 | carbonic anhydrase 4 | 443 | chr11 | 84957788 | 84958215 | + | 0 | NaN | intron (NM_007607, intron 1 of 7) | CpG | 247 | NM_007607 | 12351 | Mm.1641 | NM_007607 | ENSMUSG00000000805 | protein-coding |
| 1 | Ppp3ca | 2900074D19Rik|CN|Caln|Calna|CnA | protein phosphatase 3, catalytic subunit, alph... | 1885 | chr3 | 136670472 | 136671032 | + | 0 | NaN | 5' UTR (NM_008913, exon 1 of 14) | 5' UTR (NM_008913, exon 1 of 14) | 686 | NM_008913 | 19055 | Mm.331389 | NM_008913 | ENSMUSG00000028161 | protein-coding |
| 2 | Nr5a1 | Ad4BP|ELP|ELP-3|Ftz-F1|Ftzf1|SF-1|SF1|STF-1 | nuclear receptor subfamily 5, group A, member 1 | 1550 | chr2 | 38711554 | 38711989 | + | 0 | NaN | intron (NM_139051, intron 1 of 6) | CpG-8071 | 463 | NM_001316687 | 26423 | Mm.31387 | NM_139051 | ENSMUSG00000026751 | protein-coding |
| 3 | Bbs5 | 1700049I01Rik|2700023J09Rik | Bardet-Biedl syndrome 5 (human) | 1569 | chr2 | 69647067 | 69647652 | + | 0 | NaN | intron (NM_028284, intron 1 of 11) | CpG | 188 | NM_001362706 | 72569 | Mm.252136 | NM_028284 | ENSMUSG00000063145 | protein-coding |
| 4 | Ppp3cc | Calnc|PP2BA gamma | protein phosphatase 3, catalytic subunit, gamm... | 922 | chr14 | 70289129 | 70289377 | + | 0 | NaN | 5' UTR (NM_001360229, exon 1 of 13) | 5' UTR (NM_001360229, exon 1 of 13) | 244 | NM_001304992 | 19057 | Mm.439683 | NM_008915 | ENSMUSG00000022092 | protein-coding |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2793 | Rell1 | AA536743 | RELT-like 1 | 2181 | chr5 | 63968509 | 63968817 | + | 0 | NaN | exon (NM_145923, exon 1 of 7) | exon (NM_145923, exon 1 of 7) | 234 | NM_145923 | 100532 | Mm.243632 | NM_145923 | ENSMUSG00000047881 | protein-coding |
| 2794 | Car2 | AI131712|CAII|Ca2|Car-2|Ltw-5|Lvtw-5 | carbonic anhydrase 2 | 1743 | chr3 | 14886255 | 14887104 | + | 0 | NaN | 5' UTR (NM_009801, exon 1 of 7) | 5' UTR (NM_009801, exon 1 of 7) | 391 | NM_009801 | 12349 | Mm.1186 | NM_009801 | ENSMUSG00000027562 | protein-coding |
| 2795 | Efna2 | CEK7L|Elf1|Epl6|Eplg6|Lerk6 | ephrin A2 | 247 | chr10 | 80190330 | 80190907 | + | 0 | NaN | TTS (NM_007909) | TTS (NM_007909) | 11136 | NM_007909 | 13637 | Mm.1478 | NM_007909 | ENSMUSG00000003070 | protein-coding |
| 2796 | Rell2 | 4631403P03Rik|ependolin | RELT-like 2 | 1341 | chr18 | 37955007 | 37956344 | + | 0 | NaN | promoter-TSS (NM_010411) | promoter-TSS (NM_010411) | 116 | NM_153793 | 225392 | Mm.233516 | NM_153793 | ENSMUSG00000044024 | protein-coding |
| 2797 | Efna3 | AW494418|EFL-2|Ehk1-L|Epl3|LERK-3 | ephrin A3 | 1813 | chr3 | 89321744 | 89322243 | + | 0 | NaN | intron (NM_010108, intron 1 of 4) | CpG | 886 | NM_010108 | 13638 | Mm.331159 | NM_010108 | ENSMUSG00000028039 | protein-coding |
2798 rows × 19 columns
In [35]:
boolean = not homerClosestTSSperGene_df["geneName"].is_unique # True (credit to @Carsten)
In [36]:
boolean
Out[36]:
False
In [40]:
# df = your dataframe
# 1) identify log2FC columns
log2fc_cols = [c for c in df.columns if c.endswith("_log2FC")]
# 2) define threshold for a "strong" effect
# set to 0 if you want ANY non-zero effect to count
threshold = 0.0
# absolute values
abs_fc = df[log2fc_cols].abs()
# 3) scoring
df_scored = df.copy()
df_scored["_n_strong"] = (abs_fc > threshold).sum(axis=1)
df_scored["_sum_abs"] = abs_fc.sum(axis=1)
df_scored["_max_abs"] = abs_fc.max(axis=1) # optional tie-breaker
# 4) pick best row per gene
df_best = (
df_scored
.sort_values(
["geneName", "_n_strong", "_sum_abs", "_max_abs"],
ascending=[True, False, False, False]
)
.drop_duplicates("geneName", keep="first")
.drop(columns=["_n_strong", "_sum_abs", "_max_abs"])
)
# df_best now has exactly one row per geneName
In [41]:
df_best
Out[41]:
| geneName | mMEF_E15_aKOvWT_males_log2FC | mMEF_E15_aKOvWT_females_log2FC | mMEF_E15_patientKIvE16WT_females_log2FC | mMEF_E15_patientKIvE15WT_males_log2FC | mMEF_E13_clusterKOvWT_males_log2FC | mF9_OE_patientpCMV6vpSBmock_log2FC | mF9_OE_777vpSBmock_log2FC | mESC_KO_EGFPexcisedvWT_log2FC | E8_aKOvWT_males_log2FC | E8_aKOvWT_females_log2FC | E10_dKOvWT_mf_log2FC | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 722 | 0610010K14Rik | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 109 | 0610039H22Rik | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 936 | 1-Mar | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 6536 | 1010001N08Rik | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 6639 | 11-Mar | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1285 | Zswim6 | -0.096579 | -0.039212 | 0.521084 | -0.052862 | -0.474079 | -0.035291 | -0.238596 | 0.359563 | -0.165467 | -0.040583 | 0.131327 |
| 6943 | Zxda | 3.780557 | -1.040589 | -1.336613 | 0.625054 | NaN | -4.390570 | 0.865508 | NaN | -0.308997 | -0.161825 | -0.232630 |
| 6946 | Zxdb | -0.291974 | -0.261189 | -0.222524 | -0.098669 | -0.063877 | -0.062832 | -0.114814 | 0.251703 | -0.065300 | 0.016925 | -0.000288 |
| 6173 | Zyg11a | 1.550915 | 1.519862 | -1.048086 | 0.019624 | -0.378368 | -0.039749 | -0.157374 | 0.501974 | 1.434094 | 1.037042 | 1.620861 |
| 6390 | Zzz3 | -0.072402 | 0.145015 | -0.490400 | -0.188292 | 0.091953 | 0.056168 | 0.233457 | 0.225465 | -0.071601 | 0.014532 | 0.031210 |
2798 rows × 12 columns
In [42]:
# make geneName an ordered categorical using the other dataframe
order = homerClosestTSSperGene_df["geneName"]
df_sorted = (
df_best.assign(
geneName=pd.Categorical(df_best["geneName"], categories=order, ordered=True)
)
.sort_values("geneName")
)
In [48]:
df_sorted
Out[48]:
| geneName | mMEF_E15_aKOvWT_males_log2FC | mMEF_E15_aKOvWT_females_log2FC | mMEF_E15_patientKIvE16WT_females_log2FC | mMEF_E15_patientKIvE15WT_males_log2FC | mMEF_E13_clusterKOvWT_males_log2FC | mF9_OE_patientpCMV6vpSBmock_log2FC | mF9_OE_777vpSBmock_log2FC | mESC_KO_EGFPexcisedvWT_log2FC | E8_aKOvWT_males_log2FC | E8_aKOvWT_females_log2FC | E10_dKOvWT_mf_log2FC | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Car4 | 1.339315 | -1.011783 | -3.589685 | NaN | -3.683206 | -1.305721 | -0.273210 | -0.489246 | 0.524201 | 0.194931 | 0.375135 |
| 1 | Ppp3ca | 0.406624 | -0.104624 | 0.001710 | 0.450459 | -0.276674 | -0.118119 | -0.211948 | 0.231611 | -0.149899 | -0.124822 | 0.226837 |
| 2 | Nr5a1 | -4.865477 | 1.510755 | -3.627988 | -4.042709 | 1.480193 | 0.222983 | 0.401732 | 0.104631 | 0.309612 | 1.176672 | -0.184091 |
| 3 | Bbs5 | 0.998045 | 0.012541 | 0.765101 | 0.823408 | 0.262260 | -0.027248 | -0.039019 | -0.415166 | 0.031988 | 0.147180 | -0.056199 |
| 4 | Ppp3cc | 0.358830 | 0.194157 | 0.157946 | 0.003984 | 0.298265 | -0.048729 | -0.261857 | 0.509793 | 0.209369 | 0.113688 | 0.548918 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11299 | Rell1 | 0.260231 | 0.282502 | 0.037566 | -0.301917 | 0.371903 | -0.111070 | -0.219812 | -0.082160 | -0.033426 | -0.008482 | 0.019051 |
| 11300 | Car2 | -1.666589 | 0.676888 | 0.011434 | 0.868281 | -0.657800 | 0.102580 | 0.150735 | -0.158000 | 0.132282 | -0.062851 | 0.284113 |
| 11301 | Efna2 | -0.752565 | 0.018711 | -1.173163 | -1.734658 | 0.266176 | -0.028690 | 0.064358 | 0.048713 | -0.273028 | -0.010901 | -0.299740 |
| 11302 | Rell2 | 0.210392 | 0.180841 | 0.070738 | -0.472328 | 0.469701 | 0.032110 | -0.152993 | -0.254215 | 0.218178 | 0.336738 | 0.453236 |
| 11303 | Efna3 | 0.516740 | 0.057861 | 1.523769 | 0.957994 | -0.837704 | 0.041005 | 0.292736 | -0.330265 | -0.341390 | -0.029855 | 0.069505 |
2798 rows × 12 columns
In [72]:
import plotly.graph_objects as go
log2fc_cols = [c for c in df.columns if c != "geneName"]
z = df[log2fc_cols].to_numpy()
fig = go.Figure(
data=go.Heatmap(
z=z,
x=log2fc_cols,
y=df["geneName"],
zmid=0, # center colors around 0
hovertemplate="Gene=%{y}<br>Group=%{x}<br>log2FC=%{z}<extra></extra>",
colorscale='RdBu'
)
)
fig.update_yaxes(showticklabels=False)
fig.update_layout(
xaxis_title="Experimental group",
yaxis_title="homer.closestTSS_perGene.tsv",
height=1000,
width=500
)
fig.show()
In [74]:
fig = go.Figure(
go.Heatmap(
z=df[log2fc_cols].to_numpy(),
x=log2fc_cols,
y=df["geneName"],
zmin=-1,
zmax=1,
zmid=0,
hovertemplate="Gene=%{y}<br>Group=%{x}<br>log2FC=%{z}<extra></extra>",
colorscale='RdBu'
)
)
fig.update_yaxes(showticklabels=False)
fig.update_layout(
xaxis_title="Experimental group",
yaxis_title="homer.closestTSS_perGene.tsv",
height=2000,
width=1000
)
fig.show()
In [ ]: